Skunkware 5

home *** CD-ROM | disk | FTP | other *** search

/ Skunkware 5 / Skunkware 5.iso / src / Tools / libwais / ir / ircfiles.c < prev next >

Wrap

C/C++ Source or Header | 1995-05-03 | 45.0 KB | 2,027 lines

/* WIDE AREA INFORMATION SERVER SOFTWARE: No guarantees or restrictions. See the readme file for the full standard disclaimer. Brewster@think.com */ /* this file defines a set of helper functions * for indexing common types of files. * -brewster 7/90 */ /* I encourage adding customizations. * (too bad they all have to be hard coded, but * C did not have convenient dynamic linking facilities) * * Add three functions to this file: * boolean foo_separator_function(char *line){} * void foo_header_function(char *line){} * long foo_date_function(char *line){} * void foo_finish_header_function(char *header){} * * then add the prototypes to ircfiles.h * then add the functions to the big case statement in irbuild.c * * * to do: * filter for digests * * Tracy pointed out 2 things which we should consider when redesigning the * parser: * * - there should be a way for the parser to decide to skip a section of * input text (ie. not index it). she does this by having global variable * which is set by her custom seperator function when it wants to tell * map_over_words() to not add the words on the current line * * - there should be a way to switch lexers depending what section of a * document you are in (since word separators will change). This is * needed by the european patent office too. * */ /* Change log: * 8/90 brewster added the library customizations * 6/91 and before - added a bunch of other filters - JG * $Log: ircfiles.c,v $ * Revision 1.34 92/05/06 17:28:23 jonathan * Added filename_finish_header_function. Puts leaf name into header. * * Revision 1.33 92/05/05 11:10:50 jonathan * Added fix to bibtex indexer to ignore subsequent "booktitles" after title * has been set. Thanks to Lutz Prechelt (prechelt@ira.uka.de). * * Revision 1.32 92/04/30 12:31:08 jonathan * Fixed syntax errors in OBJ C functions. 
* * Revision 1.31 92/04/29 14:08:57 shen * chnage catalaog header string to "Title:" * * Revision 1.30 92/04/26 14:45:08 brewster * debug ziff * * Revision 1.29 92/04/26 14:39:24 brewster * tweeked ziff filter * * Revision 1.28 92/04/25 21:14:05 brewster * added ziff * * Revision 1.27 92/04/20 15:21:06 morris * added todo's for tracy * * Revision 1.26 92/03/22 18:38:29 brewster * added objective C filter * * Revision 1.25 92/03/13 08:21:37 jonathan * Added length limits to scanf's in my_getdate, thanks to * sendall@dxpt01.cern.ch (Mike Sendall). * * Revision 1.24 92/02/29 20:13:54 jonathan * separated =- for some compilers that get confused (ULTRIX). * * Revision 1.23 92/02/20 09:50:14 jonathan * Added bibtex and nhyp filters from S.P.vandeBurgt@research.ptt.nl. * * Revision 1.22 92/02/12 13:11:25 jonathan * Changed library catalog functions for new format (from fad). * * * */ #include <string.h> #include <ctype.h> #include "cutil.h" #include "ircfiles.h" extern char *current_filename; extern int current_filecount; #define MAX_HEADER_LEN 100 #define MAX_AUTHOR_LEN 25 #define MAX_DATE_LEN 4 static char* trim_trailing_newline _AP((char* string)); static char* trim_trailing_newline(string) char* string; { if(string) if(strlen(string) > 0) if(string[strlen(string) -1] == '\n') string[strlen(string) -1] = '\0'; return(string); } /* ================================= * === Groliers Customizations === * ================================= */ boolean groliers_separator_function(line) char *line; { if((strlen(line) > strlen("ARTICLE")) && substrcmp(line, "ARTICLE")){ /* printf("hit %s\n", line); */ return(true); } else{ return(false); } } char groliers_header[MAX_HEADER_LEN + 1]; void groliers_header_function(line) char *line; { if(groliers_separator_function(line)){ s_strncpy(groliers_header, line + strlen("ARTICLE") + 2, MAX_HEADER_LEN); } } void groliers_finish_header_function(header) char *header; { if(strlen(groliers_header) == 0){ s_strncpy(header, "Unknown Title", 
MAX_HEADER_LEN); } else{ s_strncpy(header, groliers_header, MAX_HEADER_LEN); } groliers_header[0] = '\0'; } /* ============================== * === RMail Customizations === * ============================== */ /* this is just a preliminary version. A good version would * produce a headline like gnu emacs RMAIL */ boolean mail_separator_function(line) char *line; { /* this should really look for a "<cr><cr>From " rather than "<cr>From " */ if((strlen(line) > strlen("From ")) && substrcmp(line, "From ")){ return(true); } else{ return(false); } } boolean rmail_separator_function(line) char *line; { if(0 == strcmp(line, "\n")){ return(true); } else{ return(false); } } /* This one is portable, but might get the wrong answer. I'm open to better code. - Jonny G */ static char *months[] = {"Jan", "Feb", "Mar", "Apr", "May", "Jun", "Jul", "Aug", "Sep", "Oct", "Nov", "Dec", NULL}; long my_getdate(line) char *line; { char date[255], *temp; int day, month, year; char cmonth[25], dow[5], tod[10]; strcpy(date, line); temp = date; while(!isdigit(*temp)) temp++; sscanf(temp, "%d %25s %d", &day, cmonth, &year); for(month = 0; months[month] != NULL; month++) if(!strcmp(cmonth, months[month])) break; if (year > 99) year = year % 100; if(day > 0 && month < 12 && year > 0) { return (10000 * year + 100 * (month+1) + day); } month = -1; day = -1; year = -1; sscanf(temp, "%d/%d/%d", &month, &day, &year); if (year > 99) year = year % 100; if(day > 0 && month < 12 && year > 0) { return (10000 * year + 100 * (month+1) + day); } month = -1; day = -1; year = -1; sscanf(temp, "%d/%d/%d", &year, &month, &day); if (year > 99) year = year % 100; if(day > 0 && month < 12 && year > 0) { return (10000 * year + 100 * (month+1) + day); } temp = date; sscanf(temp, "%5s %25s %d %10s %d", dow, cmonth, &day, tod, &year); for(month = 0; months[month] != NULL; month++) if(!strcmp(cmonth, months[month])) break; if (year > 99) year = year % 100; if(day > 0 && month < 12 && year > 0) { return (10000 * year + 100 
* (month+1) + day); } return 0; } long mail_date_function(line) char *line; { if((strlen(line) > strlen("Date: ")) && substrcmp(line, "Date: ")){ return(my_getdate(line+6)); } else if((strlen(line) > strlen("From ")) && substrcmp(line, "From ")){ char *p; p = (char*)index(line+5, ' '); if(p != NULL) return(my_getdate(p+1)); } else return -1; } char mail_subject[MAX_HEADER_LEN + 1]; char mail_from[MAX_HEADER_LEN + 1]; void mail_header_function(line) char *line; { if((strlen(line) > strlen("Subject: ")) && substrcmp(line, "Subject: ") && (strlen(mail_subject) == 0)){ strcpy(mail_subject, "Re: "); s_strncat(mail_subject, line + strlen("Subject: "), MAX_HEADER_LEN, MAX_HEADER_LEN); trim_trailing_newline(mail_subject); } else if((strlen(line) > strlen("From: ")) && substrcmp(line, "From: ") && (strlen(mail_from) == 0)){ /* this should find the <foo@bar> field in the from list */ s_strncpy(mail_from, line + strlen("From: "), MAX_HEADER_LEN); trim_trailing_newline(mail_from); } } void mail_finish_header_function(header) char *header; { if(strlen(mail_subject) != 0 && strlen(mail_from) != 0){ /* trim the from line if needed */ if(strlen(mail_from) > 10){ mail_from[10] = '\0'; } s_strncpy(header, mail_from, MAX_HEADER_LEN); s_strncat(header, " ", MAX_HEADER_LEN, MAX_HEADER_LEN); s_strncat(header, mail_subject, MAX_HEADER_LEN, MAX_HEADER_LEN); } else if(strlen(mail_subject) != 0){ s_strncpy(header, mail_subject, MAX_HEADER_LEN); } else if(strlen(mail_from) != 0){ s_strncpy(header, mail_from, MAX_HEADER_LEN); } else{ strcpy(header, "Unknown Subject"); } /* printf("%s\n", header); lots of output !! 
*/ mail_from[0] = '\0'; mail_subject[0] = '\0'; } boolean mail_or_rmail_separator(line) char *line; { static boolean blank_line = false; if((strlen(line) > strlen("From ")) && substrcmp(line, "From ") && blank_line == true){ blank_line = false; return(true); } if(substrcmp(line, "")){ blank_line = true; return(true); } if(!strcmp(line, "\n")){ blank_line = true; } else{ blank_line = false; } return(false); } /* ======================================== * === MMDF Mail folder Customizations ==== * ======================================== ^A^A^A^A body ^A^A^A^A ^A^A^A^A next body ^A^A^A^A */ boolean mmdf_separator_function(line) char *line; { static boolean saw_start = false; if ( substrcmp(line, "") ) { if( saw_start == true) saw_start = false; /* the second one is the end marker */ else saw_start = true; if (saw_start) { /*printf(" mmdf_separator_function returns true\n"); */ return (true); } } /* mix in osf mail archives which are seperated by "=" on a line * by itself */ if ( substrcmp(line, "=") ) { return (true); } return (false); #if 0 if((strlen(line) > strlen("From ")) && substrcmp(line, "From ") && blank_line == true){ blank_line = false; return(true); } if(substrcmp(line, "")){ blank_line = true; return(true); } if(!strcmp(line, "\n")){ blank_line = true; } else{ blank_line = false; } return(false); #endif } /* ======================================== * === Mail Digest Customizations ==== * ======================================== */ boolean mail_digest_separator_function(line) char *line; { if((strlen(line) > strlen("-----------------------------")) && substrcmp(line, "------------------------------")){ return(true); } else{ return(false); } } /* ======================================== * === Library Catalog Customizations === * ======================================== */ #define TITLE_MARKER "Title: " #define FIRST_LINE_MARKER "Call No...." 
/* just use the title */ boolean catalog_separator_function(line) char *line; { if (strstr(line, FIRST_LINE_MARKER)) { return(true); } else{ return(false); } } char catalog_header[MAX_HEADER_LEN + 1]; void catalog_header_function(line) char *line; { char * title_start; if (title_start = strstr(line, TITLE_MARKER)) { strncpy(catalog_header, title_start + strlen(TITLE_MARKER), MAX_HEADER_LEN); } } void catalog_finish_header_function(header) char *header; { if(strlen(catalog_header) == 0){ strcpy(header, "Unknown Title"); } else{ s_strncpy(header, catalog_header, MAX_HEADER_LEN); } catalog_header[0] = '\0'; } /* ============================ * === Bio Customizations === * ============================ */ /* customizations for a DB of genetic abstracts */ boolean hit_header = false; boolean bio_separator_function(line) char *line; { if((strlen(line) > strlen(">>>")) && substrcmp(line, ">>>")){ return(true); } else{ return(false); } } char bio_header[MAX_HEADER_LEN + 1]; void bio_header_function(line) char *line; { if(hit_header /* we just hit a seperator previous to this */ && (!bio_separator_function(line)) /* we are not on the separator now */ && strlen(bio_header) == 0){ /* and we have not saved the headline yet */ strcpy(bio_header, line); waislog(WLOG_MEDIUM, WLOG_INDEX, "storing line: %s", bio_header); hit_header = false; } } void bio_finish_header_function(header) char *header; { hit_header = true; /* turn on the flag */ if(strlen(bio_header) == 0){ strcpy(header, "Unknown Title"); } else{ strcpy(header, bio_header); } bio_header[0] = '\0'; } /* ================================= * === CMApp Customizations === * ================================= */ boolean cmapp_separator_function(line) char *line; { if((strlen(line) > strlen("@A")) && substrcmp(line, "@A")){ /* printf("hit %s\n", line); */ return(true); } else{ return(false); } } char cmapp_header[MAX_HEADER_LEN + 1]; void cmapp_header_function(line) char *line; { if((strlen(line) > strlen("APPLICATION:")) && 
substrcmp(line, "APPLICATION:")){ /* printf("hit %s\n", line); */ s_strncpy(cmapp_header, line + strlen("APPLICATION:"), MAX_HEADER_LEN); } } void cmapp_finish_header_function(header) char *header; { if(strlen(cmapp_header) == 0){ s_strncpy(header, "Unknown Title", MAX_HEADER_LEN); } else{ s_strncpy(header, cmapp_header, MAX_HEADER_LEN); } cmapp_header[0] = '\0'; } /* ================================= * === Jargon Customizations === * ================================= * * GW - updated for Jargon File 2.9.8 */ /* Format of an entry: [blank line] :Title of This entry: first line of text of this entry second line of text of this entry third line of text of this entry [blank line] Any line which starts with a colon is considered to be the beginning of an entry. -GW */ static int jargon_seen_entry = 0; boolean jargon_separator_function(line) register char *line; { if(!jargon_seen_entry && line[0] == ':') jargon_seen_entry = 1; return line[0] == ':'; } char jargon_header[MAX_HEADER_LEN + 1]; void jargon_header_function(line) char *line; { if(line[0] != ':') return; strncpy(jargon_header,line+1,MAX_HEADER_LEN); jargon_header[MAX_HEADER_LEN] = '\0'; if(NULL != (line = strchr(jargon_header,':'))){ if(line[1] == ':') line++; line++; line[0] = '\0'; } } void jargon_finish_header_function(header) char *header; { if(jargon_seen_entry) { strncpy(header, jargon_header, MAX_HEADER_LEN); } jargon_header[0] = '\0'; } /* ================================= * === Internet Resource Guide === * ================================= */ char irg_header[MAX_HEADER_LEN + 1]; boolean irg_header_set = FALSE; boolean irg_separator_function(line) char *line; { if(line[0] == 12){ /* control L */ irg_header_set = FALSE; return(true); } else return(false); } void irg_header_function(line) char *line; { if((irg_header_set == FALSE) && (line[0] == 32 )){ /* space */ s_strncpy(irg_header, line + strspn(line, " "), MAX_HEADER_LEN); irg_header_set = TRUE; } } void irg_finish_header_function(header) char 
*header; { if(strlen(irg_header) == 0){ s_strncpy(header, "Unknown Title", MAX_HEADER_LEN); } else{ s_strncpy(header, irg_header, MAX_HEADER_LEN); } irg_header[0] = '\0'; irg_header_set = FALSE; } /* ======================== * === Dash Separator === * ======================== */ /* * dash-seperate entries * used in Introduction to Algorithms bug.list, suggestions, etc. * --------------------... at least 20 dashes * header * item * .. * --------------------... at least 20 dashes */ boolean dash_separator_function(line) char *line; { if((strlen(line) > 20) && substrcmp(line,"--------------------")){ /* printf("hit %s\n", line); */ return(true); } else{ return(false); } } char dash_header[MAX_HEADER_LEN + 1]; void dash_header_function(line) char *line; { if(!dash_separator_function(line) && (strlen(dash_header) < (MAX_HEADER_LEN - 1))){ s_strncat(dash_header, line, MAX_HEADER_LEN, MAX_HEADER_LEN); trim_trailing_newline(dash_header); strncat(dash_header, " ", MAX_HEADER_LEN); } } void dash_finish_header_function(header) char *header; { if (strlen(dash_header) == 0) { strcpy(header, "No Title"); } else { s_strncpy(header, dash_header, MAX_HEADER_LEN); } dash_header[0] = '\0'; } /* ============================ * === one_line Separator === * ============================ */ /* this is where each line is a document (good for databases) */ boolean one_line_hit_header = false; boolean one_line_separator_function(line) char *line; { return(true); } char one_line_header[MAX_HEADER_LEN + 1]; void one_line_header_function(line) char *line; { s_strncpy(one_line_header, line, MAX_HEADER_LEN); } void one_line_finish_header_function(header) char *header; { if (strlen(one_line_header) == 0) { strcpy(header, "No Title"); } else { s_strncpy(header, one_line_header, MAX_HEADER_LEN); } one_line_header[0] = '\0'; } /* ============================= * === Paragraph Separator === * ============================= */ /* paragraph files - seperated by a blank line. 
Next line is the header */ char para_header[MAX_HEADER_LEN +1]; static boolean para_start = true; boolean para_separator_function(line) char *line; { if (para_start == true) { para_start = false; return true; } if (strlen(line) < 2) para_start = true; return false; } void para_header_function(line) char *line; { if (para_header[0] == 0) s_strncpy(para_header, line, MAX_HEADER_LEN); } void para_finish_header_function(header) char *header; { if (strlen(para_header) == 0) { strcpy(header, "No Title"); } else { s_strncpy(header, para_header, MAX_HEADER_LEN); } para_header[0] = 0; } /* ========================== * === Seeker Separator === * ========================== */ boolean seeker_separator_function(line) char *line; { return(dash_separator_function(line)); } char seeker_header[MAX_HEADER_LEN + 1]; boolean in_headline = FALSE; void seeker_header_function(line) char *line; { if(strlen(line) > strlen("Headline:") && substrcmp(line, "Headline:")){ in_headline = TRUE; seeker_header[0] = '\0'; /* printf("hit headline!\n"); */ } else if(in_headline == TRUE && (strlen(seeker_header) < (MAX_HEADER_LEN - 1))){ s_strncat(seeker_header, line, MAX_HEADER_LEN, MAX_HEADER_LEN); trim_trailing_newline(seeker_header); } } void seeker_finish_header_function(header) char *header; { if (strlen(seeker_header) == 0) { strcpy(header, "No Title"); } else { s_strncpy(header, seeker_header, MAX_HEADER_LEN); } seeker_header[0] = '\0'; in_headline = TRUE; } /* ========================== * === RLIN Separator === * ========================== */ boolean rlin_separator_function(line) char *line; { return(dash_separator_function(line)); } char rlin_header[MAX_HEADER_LEN + 1]; boolean rlin_in_headline = FALSE; void rlin_header_function(line) char *line; { if(rlin_separator_function(line)){ rlin_in_headline = TRUE; rlin_header[0] = '\0'; /* printf("hit headline!\n"); */ } else if(rlin_in_headline == TRUE && (strlen(rlin_header) < (MAX_HEADER_LEN - 1))){ s_strncat(rlin_header, line, MAX_HEADER_LEN, 
MAX_HEADER_LEN); trim_trailing_newline(rlin_header); } } void rlin_finish_header_function(header) char *header; { if (strlen(rlin_header) == 0) { strcpy(header, "No Title"); } else { s_strncpy(header, rlin_header, MAX_HEADER_LEN); } rlin_header[0] = '\0'; in_headline = TRUE; } /* ======================================== * === MH_BBoard Customizations ==== * ======================================== */ /* gcardwel@uci.edu MH bboards use a series of control A's to do a blank line.. yuk! */ boolean mh_bboard_separator_function(line) char *line; { static boolean blank_line = false; if((strlen(line) > strlen("BBoard-ID: ")) && substrcmp(line, "BBoard-ID: ") && blank_line == true){ blank_line = false; return(true); } if(!strcmp(line, "\001\001\001\001\n")){ blank_line = true; } else{ blank_line = false; } return (false); } /* * Customization for files saved from within the 'rn' newsreader. * * These can either be in 'mail' format, or they can be in a similar * format which starts each article with the pseudo-header * 'Article: 42 of comp.sys.foobar'. Other than that, we treat this * just like 'mail'. * * wollman@uvm.edu, Sun Sep 8 20:12:21 EDT 1991 */ boolean rn_separator_function(line) char *line; { if(!strncmp(line,"From ",5) || !strncmp(line,"Article ",7) || !strncmp(line,"Article: ",9)) return true; return false; } /* * Customizations for GNU Emacs Info files * * When indexing info files, the user must index the files with real text * in them, rather than the file with the tag and indirect tables; otherwise * you'll end up with lots of garbage in your index. * * G. 
Wollman */ static int done_headline = 0; boolean emacs_info_separator_function(line) /* hate K&R-style definitions */ char *line; { if(line[0] == (char)31) { done_headline = 0; return true; } return false; } static char emacs_info_headline[MAX_HEADER_LEN+1]; void emacs_info_header_function(line) register char *line; { int i; if(done_headline) return; if(strncmp(line,"File: ",6)) return; done_headline = 1; line += 6; /* skip over "File: " */ i = 1; emacs_info_headline[0] = '('; while(*line && *line != ',' && (i < MAX_HEADER_LEN-1)) emacs_info_headline[i++] = *line++; emacs_info_headline[i++] = ')'; line += 9; /* skip over ", Node: " */ /* copy the name of the info node into the headline */ while(*line && (i < MAX_HEADER_LEN) && (*line != ',')) emacs_info_headline[i++] = *line++; emacs_info_headline[i++] = '\0'; } void emacs_info_finish_header_function(header) char *header; { strcpy(header,emacs_info_headline); } /* ======================================== * === Medline Customizations ==== * ======================================== */ /* Francois Schiettecatte with help from: Tom Emmel Karen Phipps */ char medline_header[MAX_HEADER_LEN +1]; char medline_title[MAX_HEADER_LEN + 1]; char medline_date[MAX_HEADER_LEN + 1]; char medline_author[MAX_HEADER_LEN + 1]; static boolean medline_start = true; boolean medline_separator_function(line) char *line; { if (medline_start == true) { medline_start = false; return true; } if (strlen(line) < 2) medline_start = true; return false; } void medline_header_function(line) char *line; { char *ptr; if((strlen(line) > strlen("TI ")) && (substrcmp(line, "TI "))){ strncpy(medline_title, line + strlen("TI "), MAX_HEADER_LEN); } if((strlen(line) > strlen("SO ")) && (substrcmp(line, "SO "))){ ptr = strchr(line,'1'); strncpy(medline_date, ptr, MAX_DATE_LEN); } if((strlen(line) > strlen("AU ")) && (substrcmp(line, "AU "))){ ptr = strtok(line + strlen("AU "),".,"); strcpy(medline_author,ptr); strncat(medline_author, " ", MAX_AUTHOR_LEN); } } 
void medline_finish_header_function(header) char *header; { if(strlen(medline_author) > 0 ){ strncat(medline_header,medline_author, MAX_HEADER_LEN); } if(strlen(medline_date) > 0 ){ strncat(medline_header,"(", MAX_HEADER_LEN); strncat(medline_header,medline_date, MAX_HEADER_LEN); strncat(medline_header,") ", MAX_HEADER_LEN); } if(strlen(medline_title) > 0 ){ strncat(medline_header,medline_title, MAX_HEADER_LEN); } if(strlen(medline_header) == 0){ strcpy(header, "No Title"); } else{ strncpy(header, medline_header, MAX_HEADER_LEN); } medline_header[0] = '\0'; medline_title[0] = '\0'; medline_date[0] = '\0'; medline_author[0] = '\0'; } /* ======================================== * === Refer Customizations ==== * ======================================== */ /* Francois Schiettecatte with help from: Tom Emmel Karen Phipps */ char refer_header[MAX_HEADER_LEN +1]; char refer_title[MAX_HEADER_LEN + 1]; char refer_date[MAX_HEADER_LEN + 1]; char refer_author[MAX_HEADER_LEN + 1]; static boolean refer_start = true; boolean refer_separator_function(line) char *line; { if (refer_start == true) { refer_start = false; return true; } if (strlen(line) < 2) refer_start = true; return false; } void refer_header_function(line) char *line; { if((strlen(line) > strlen("%T ")) && (substrcmp(line, "%T "))){ strncpy(refer_title, line + strlen("%T "), MAX_HEADER_LEN); } else if((strlen(line) > strlen("%B ")) && (substrcmp(line, "%B ")) && (strlen(refer_title) == 0)){ strncpy(refer_title, line + strlen("%B "), MAX_HEADER_LEN); } if((strlen(line) > strlen("%D ")) && (substrcmp(line, "%D "))){ strncpy(refer_date, line + strlen("%D "), MAX_DATE_LEN); } if((strlen(line) > strlen("%A ")) && (substrcmp(line, "%A ")) && (strlen(refer_author) == 0)){ strncpy(refer_author, line + strlen("%A "), MAX_AUTHOR_LEN); strncat(refer_author, " ", MAX_AUTHOR_LEN); } else if((strlen(line) > strlen("%E ")) && (substrcmp(line, "%E ")) && (strlen(refer_author) == 0)){ strncpy(refer_author, line + strlen("%E "), 
MAX_AUTHOR_LEN); strncat(refer_author, " ", MAX_AUTHOR_LEN); } } void refer_finish_header_function(header) char *header; { if(strlen(refer_author) > 0 ){ strncat(refer_header,refer_author, MAX_HEADER_LEN); } if(strlen(refer_date) > 0 ){ strncat(refer_header,"(", MAX_HEADER_LEN); strncat(refer_header,refer_date, MAX_HEADER_LEN); strncat(refer_header,") ", MAX_HEADER_LEN); } if(strlen(refer_title) > 0 ){ strncat(refer_header,refer_title, MAX_HEADER_LEN); } if(strlen(refer_header) == 0){ strncpy(header, "No Title", MAX_HEADER_LEN); } else{ strncpy(header, refer_header, MAX_HEADER_LEN); } refer_header[0] = '\0'; refer_author[0] = '\0'; refer_date[0] = '\0'; refer_title[0] = '\0'; } /* =========================================== * === First Line Customizations ==== * =========================================== */ /* this means the first line of the file is the headline. useful for the lyrics server */ /* paragraph files - seperated by a blank line. Next line is the header */ char first_line_header[MAX_HEADER_LEN +1]; boolean first_line_separator_function(line) char *line; { return false; } void first_line_header_function(line) char *line; { if (first_line_header[0] == '\0') s_strncpy(first_line_header, line, MAX_HEADER_LEN); } void first_line_finish_header_function(header) char *header; { if (strlen(first_line_header) == 0) { strcpy(header, "No Title"); } else { s_strncpy(header, first_line_header, MAX_HEADER_LEN); } first_line_header[0] = 0; } /* ========================= * === BIBTEX Separator === * ========================= * S.P.vandeBurgt@research.ptt.nl (Stan) * * BibTeX entries * * @......{ * ...... 
* title = header * .......} * */ static char bibtex_header[MAX_HEADER_LEN + 1]; boolean bibtex_separator_function(line) char *line; { char *p = line; while (isspace(*p)) p++; /* skip space */ return(*p == '@'); } void bibtex_header_function(line) char *line; { char *p = line; p = strstr(line, "title"); if (p == NULL) p = strstr(line, "Title"); if (p == NULL) p = strstr(line, "TITLE"); if (p != NULL && (p == line || !isalpha(*(p-1)))) { p += 5; while (isspace(*p)) p++; /* skip space */ if (*p == '=') /* should be an '=' now */ { p++; /* skip bibtex char's */ while (isspace(*p) || *p == '"' || *p == '{') p++; strncpy(bibtex_header, p, MAX_HEADER_LEN); for (p = bibtex_header; *p != '\0'; p++) { /* replace bibtex char's */ if (*p == '\n' || *p == '"' || *p == '}' || *p == '{') { *p = ' '; } } } } } void bibtex_finish_header_function(header) char *header; { if (bibtex_header[0] == '\0') { strcpy(header, "Unknown Title"); } else{ strncpy(header, bibtex_header, MAX_HEADER_LEN); } bibtex_header[0] = '\0'; } /* ========================= * === NHYP Separator === * ========================= * S.P.vandeBurgt@research.ptt.nl (Stan) * Nhyp entries * * ?:? header * ...... * ...... 
* */ static char nhyp_header[MAX_HEADER_LEN + 1]; boolean nhyp_separator_function(line) char *line; { return(strstr(line, "?:?") != NULL); } void nhyp_header_function(line) char *line; { char *p = line; p = strstr(line, "?:?"); if (p != NULL) { p += 3; while (isspace(*p)) p++; /* skip space */ strncpy(nhyp_header, p, MAX_HEADER_LEN); trim_trailing_newline(nhyp_header); } } void nhyp_finish_header_function(header) char *header; { if (nhyp_header[0] == '\0') { strcpy(header, "Unknown Title"); } else{ strncpy(header, nhyp_header, MAX_HEADER_LEN); } nhyp_header[0] = '\0'; } /* ========================== * === Objective-C code === * ========================== */ #ifdef NeXT /* only do this if it is on a NeXT */ /*----------------------- FSA -------------------*/ #define fsa_max_edges 4 #define fsa_error_state (-1) typedef struct { int if_input; int then_goto; } fsa_edge; /* action (if non-NULL) is excuted before transfer to next state is made */ /* action takes as arg the int input that will decide the next state */ typedef struct { int default_goto; int n_edges; fsa_edge edges[fsa_max_edges]; int (*action)(); } fsa_vertex; int fsa_step(input, state_p, table) int input; int *state_p; fsa_vertex *table; { int next_state, e; int (*this_action)(); if(*state_p < 0) return(*state_p = fsa_error_state); this_action = table[*state_p].action; if(this_action) this_action(input); for(e=0; e<table[*state_p].n_edges; e++) if(input == table[*state_p].edges[e].if_input) { next_state = table[*state_p].edges[e].then_goto; break; } if(e >= table[*state_p].n_edges) next_state = table[*state_p].default_goto; if(next_state < 0) next_state = fsa_error_state; return(*state_p = next_state); } /* sends null char as last input, returns final state */ int fsa_run(s, state_p, table) char *s; int *state_p; fsa_vertex *table; { char *p; for(p=s; *p; p++) fsa_step((int) *p, state_p, table); fsa_step(0, state_p, table); return(*state_p); } /*----------------------- end FSA -------------------*/ static 
int wobjc_brace_level = 0; /* count of unclosed '{' seen by wobjc_process_stripped_code */
static int wobjc_paren_level = 0; /* count of unclosed '(' seen by wobjc_process_stripped_code */
static int wobjc_strip_state = 0; /* state handed to fsa_run() with wobjc_strip_fsa */
static int wobjc_context = 0; /* state handed to fsa_step() with wobjc_context_fsa */
static boolean wobjc_separator = false; /* set by wobjc_separate(), read and cleared by wobjc_separator_function() */
static char wobjc_class[MAX_HEADER_LEN+1]; /* class name text recorded while parsing */
static char *wobjc_class_end = 0; /* write position inside wobjc_class (0 until first used) */
static char wobjc_header[MAX_HEADER_LEN+1]; /* headline text recorded while parsing */
static char *wobjc_header_end = 0; /* write position inside wobjc_header (0 until first used) */

/* Characters treated as blanks, and characters treated as part of a word. */
#define WOBJC_BLANK " \t\n\r"
#define WOBJC_WORD "qwertyuiopasdfghjklzxcvbnmQWERTYUIOPASDFGHJKLZXCVBNM_0123456789"

/* Flag next line as separator, when context fsa says so. */
/* Used as an fsa action; INPUT is ignored.  Returns true. */
static int wobjc_separate(input)
int input;
{
  return(wobjc_separator = true);
}

/* FSA to parse objective-C constructs. */
/* Each entry is { default_goto, n_edges, { { if_input, then_goto }... }, action }. */
/* The number in front of each entry is its state index.  The inputs ' ', */
/* 'A' and '{' are the class codes produced by wobjc_process_stripped_code. */
static fsa_vertex wobjc_context_fsa[] = {
  /*  0 */ { 0, 1, {{ '@', 1 }}}, /* look for objc constructs */
  /*  1 */ { 0, 1, {{ 'i', 20 }}},
  /*  2 */ { 3, 1, {{ ' ', 2 }}}, /* look for @imp class */
  /*  3 */ { 4, 1, {{ 'A', 3 }}},
  /*  4 */ { 4, 3, {{ '+', 6 },{ '-', 8 },{ '@', 10 }}},/* in @imp */
  /*  5 */ { 4, 3, {{ '+', 6 },{ '-', 8 },{ '@', 10 }}, wobjc_separate},
  /*  6 */ { 6, 1, {{ '{', 7 }}}, /* after '+': in a +method declaration, wait for its body */
  /*  7 */ { 5, 1, {{ '{', 7 }}}, /* inside the +method body; leaving it goes to state 5 */
  /*  8 */ { 8, 1, {{ '{', 9 }}}, /* after '-': in a -method declaration, wait for its body */
  /*  9 */ { 5, 1, {{ '{', 9 }}}, /* inside the -method body; leaving it goes to state 5 */
  /* 10 */ { 4, 1, {{ 'e', 11 }}}, /* look for @end of @imp */
  /* 11 */ { 4, 1, {{ 'n', 12 }}},
  /* 12 */ { 4, 1, {{ 'd', 0 }}},
  /* 13 */ { 14, 1, {{ ' ', 13 }}}, /* look for @intf class */
  /* 14 */ { 15, 1, {{ 'A', 14 }}},
  /* 15 */ { 15, 1, {{ '@', 16 }}}, /* in @intf */
  /* 16 */ { 15, 1, {{ 'e', 17 }}}, /* look for @end of @intf */
  /* 17 */ { 15, 1, {{ 'n', 18 }}},
  /* 18 */ { 15, 1, {{ 'd', 19 }}},
  /* 19 */ { 0, 1, {{ '@', 1 }}, wobjc_separate},
  /* 20 */ { 0, 2, {{ 'm', 21 },{ 'n', 33 }}}, /* look for @impl */
  /* 21 */ { 0, 1, {{ 'p', 22 }}},
  /* 22 */ { 0, 1, {{ 'l', 23 }}},
  /* 23 */ { 0, 1, {{ 'e', 24 }}},
  /* 24 */ { 0, 1, {{ 'm', 25 }}},
  /* 25 */ { 0, 1, {{ 'e', 26 }}},
  /* 26 */ { 0, 1, {{ 'n', 27 }}},
  /* 27 */ { 0, 1, {{ 't', 28 }}},
  /* 28 */ { 0, 1, {{ 'a', 29 }}},
  /* 29 */ { 0, 1, {{ 't', 30 }}},
  /* 30 */ { 0, 1, {{ 'i', 31 }}},
  /* 31 */ { 0, 1, {{ 'o', 32 }}},
  /* 32 */ { 0, 1, {{ 'n', 2 }}},
  /* 33 */ { 0, 1, {{ 't', 34 }}}, /* look for @intf */
  /* 34 */ { 0, 1, {{ 'e', 35 }}},
  /* 35 */ { 0, 1, {{ 'r', 36 }}},
  /* 36 */ { 0, 1, {{ 'f', 37 }}},
  /* 37 */ { 0, 1, {{ 'a', 38 }}},
  /* 38 */ { 0, 1, {{ 'c', 39 }}},
  /* 39 */ { 0, 1, {{ 'e', 13 }}}
};

/* Action to be used by
stripping fsa in non-commented, non-quoted state. */ /* This runs context fsa. */ static int wobjc_process_stripped_code(input) int input; { int context_input; switch(input) { /* Increment brace/paren levels as appropriate. */ case '{': wobjc_brace_level++; break; case '}': if(wobjc_brace_level > 0) wobjc_brace_level--; break; case '(': wobjc_paren_level++; break; case ')': if(wobjc_paren_level > 0) wobjc_paren_level--; break; case '\"': break; case '\'': break; case '/': break; default: /* If in correct context and not in brace/paren/comment/quote, */ /* then record header info. */ if(wobjc_brace_level==0 && wobjc_paren_level==0) { /* Recording class or instance method. Ignore multiple blanks. */ if(wobjc_context==6 || wobjc_context==8) { if(!wobjc_header_end || wobjc_header_end==wobjc_header) { strcpy(wobjc_header, (wobjc_context==6 ? "+[" : "-[")); strcat(wobjc_header, wobjc_class); strcat(wobjc_header, " "); wobjc_header_end = wobjc_header+strlen(wobjc_header); } if((wobjc_header_end - wobjc_header)<(MAX_HEADER_LEN-5) && !(strchr(WOBJC_BLANK, *(wobjc_header_end-1)) && strchr(WOBJC_BLANK, input))) { *wobjc_header_end+= input; *wobjc_header_end = 0; } } /* Recording class name for @implementation or @interface. */ if(strchr(WOBJC_WORD, input) && (wobjc_context==2 || wobjc_context==3 || wobjc_context==13 || wobjc_context==14)) { if(wobjc_context==2 || wobjc_context==13 || !wobjc_class_end) wobjc_class_end = wobjc_class; if(wobjc_context==13 || (wobjc_context==14 && !wobjc_header_end)) wobjc_header_end = wobjc_header; if((wobjc_class_end - wobjc_class_end)<(MAX_HEADER_LEN/2)) { *wobjc_class_end+= input; *wobjc_class_end = 0; } if((wobjc_context==13 || wobjc_context==14) && (wobjc_header_end-wobjc_header_end)<(MAX_HEADER_LEN/2)) { *wobjc_header_end+= input; *wobjc_header_end = 0; } } } /* Since not in comment/quote, run context fsa. */ /* Input is modified like this: */ /* Non-zero brace level => '{'. */ /* Else spaces => ' '. 
*/ /* Else if in correct contexts, word letters => 'A'. */ context_input = input; if(wobjc_brace_level>0) context_input = '{'; else if(strchr(WOBJC_BLANK, input)) context_input = ' '; else if((wobjc_context==3 || wobjc_context==14) && strchr(WOBJC_WORD, input)) context_input = 'A'; fsa_step(context_input, &wobjc_context, wobjc_context_fsa); break; } return(true); } /* FSA to strip out comments and quotes. */ static fsa_vertex wobjc_strip_fsa[] = { { 0, 3, {{ '/', 1 },{ '\"', 5 },{ '\'', 7 }}, wobjc_process_stripped_code}, { 0, 2, {{ '*', 2 },{ '/', 4 }}}, /* look for comment */ { 2, 1, {{ '*', 3 }}}, /* in /* comment */ { 2, 2, {{ '/', 0 },{ '*', 3 }}}, { 4, 1, {{ '\n', 0 }, { '\0', 0 }}}, /* in // comment */ { 5, 2, {{ '\\', 6 },{ '\"', 0 }}}, /* in " quote */ { 5, 0, }, { 7, 2, {{ '\\', 8 },{ '\'', 0 }}}, /* in ' quote */ { 7, 0, } }; boolean wobjc_separator_function(line) char *line; { if(wobjc_separator) { wobjc_separator = false; return true; } else return false; } void wobjc_header_function(line) char *line; { /* Run stripping fsa, which will run context fsa. */ fsa_run(line, &wobjc_strip_state, wobjc_strip_fsa); return; } void wobjc_finish_header_function(header) char *header; { char *p; /* Flush terminal blanks and balance opening '[' if any. */ for(p=wobjc_header+strlen(wobjc_header); p>wobjc_header && strchr(WOBJC_BLANK, *(p-1)); p--); if(wobjc_header[0]=='+' || wobjc_header[0]=='-') *p+= ']'; *p = 0; /* Copy out final header. */ strcpy(header, wobjc_header); wobjc_header[0] = 0; wobjc_header_end = wobjc_header; return; } #endif /* def NeXT */ /* ============================== * === Ziff computer select === * ============================== */ /* these filters index ziff computer select cd-rom files that have been offloaded from the CDROM. This is for indexing the CACM files that have been explicitly ok'ed by ACM. 
All other uses would violate the no lan access restrictions of Ziff */ #define ZIFF_TITLE_MARKER_1 "Title: " #define ZIFF_TITLE_MARKER_2 "Company: " #define ZIFF_FIRST_LINE_MARKER " *****" /* just use the title */ boolean ziff_separator_function(line) char *line; { if (strstr(line, ZIFF_FIRST_LINE_MARKER)) { return(true); } else{ return(false); } } char ziff_header[MAX_HEADER_LEN + 1]; void ziff_header_function(line) char *line; { if (strstr(line, ZIFF_TITLE_MARKER_1) || strstr(line, ZIFF_TITLE_MARKER_2)) { strncpy(ziff_header, line + strlen(ZIFF_TITLE_MARKER_1), MAX_HEADER_LEN); } } void ziff_finish_header_function(header) char *header; { if(strlen(ziff_header) == 0){ strcpy(header, "Unknown Title"); } else{ s_strncpy(header, ziff_header, MAX_HEADER_LEN); } ziff_header[0] = '\0'; } /* special header function for filename only type */ void filename_finish_header_function(header) char* header; { char *p = strrchr(current_filename, '/'); if (p != NULL) { p++; } else { p = current_filename; } s_strncpy(header, p, MAX_HEADER_LEN); } /* ============================== * === SCO ERG mail threads Customizations === * hess * ============================== */ /* separator_function is called on every line to see if it * separates documents. 
*/ static int erg_thread_line_no = 0; static int lines_to_use = 0; static int saw_act_queue = 0; static int size_of_act_queue = 0; #define INIT register char *sp = instring; #define GETC() (*sp++) #define PEEKC() (*sp) #define UNGETC(c) (--sp) #define RETURN(c) return; #define ERROR(c) regerr() #include <regexp.h> regerr() { printf("oops, regerr()\n"); } #define ESIZE 256 static char expbuf[ESIZE] = { "" } ; static char Customer[MAX_HEADER_LEN + 1] = { "" }; static char Support_Reps[MAX_HEADER_LEN + 1]= { "" }; static char descript[MAX_HEADER_LEN + 1]= { "" }; static char erg_id[MAX_HEADER_LEN + 1]= { "" }; static int once = 0; static int saw_descript = 0; boolean erg_thread_separator_function(line) char *line; { if ( !once ) { once++; compile( "PROBLEM", expbuf, &expbuf[ESIZE], '\0'); } if(lines_to_use==0) { /* get from environment, default 300 */ char *env = getenv("LINES_TO_INDEX"); lines_to_use = ( env ? atoi(env) : 300 ); } erg_thread_line_no++; /* the threads (at least from support ) * have a section that we want to skip, it starts with : Active OS queue ========================== 1. ..... * This contains every open item on the queue, so a query on customer * name will turn up in many threads that are unrelated. * I just skip till a "From" line from the occurance of "Active OS queue" */ if ( substrcmp(line, "Active OS queue") ) { saw_act_queue = 1; } if ( saw_act_queue && substrcmp(line,"From") ) { saw_act_queue = 0; } if ( saw_act_queue ) { line[0] = '\0'; /* null the line, so it won't be indexed */ } /* since one thread is one object, we never see a seperator. * just check our total indexed size */ if (erg_thread_line_no >= lines_to_use ) { /* seek ahead to end of file, allows us to skip indexing * bulk of very large files. * fp = input_stream, size = file_size */ extern FILE *input_stream; fseek(input_stream , 0L, SEEK_END); } return(false); } /* * header_function is called on every line so that a headline * can be accumulated. 
This assumes that it will side effect global * variables. */ void erg_thread_header_function(line) char *line; { if( (strlen(line) > strlen("Subject: ")) && substrcmp(line, "Subject: ") ) { if (strlen(mail_subject) == 0) { strcpy(mail_subject, "Re: "); s_strncat(mail_subject, line + strlen("Subject: "), MAX_HEADER_LEN, MAX_HEADER_LEN); trim_trailing_newline(mail_subject); } /* since this is a subject, look for the erg#### so we can put it in the header */ if (strlen(erg_id) == 0 ) { char *p = line; while ( p[0] && p[1] && p[2] && p[3] && p[4] && p[5] && p[6] ) { if (p[0] == 'e' && p[1] == 'r' && p[2] == 'g' && (p[3] >= '0' && p[3] <= '9') && (p[4] >= '0' && p[4] <= '9') && (p[5] >= '0' && p[5] <= '9') && (p[6] >= '0' && p[6] <= '9') ) { strcpy(erg_id,p); *p = erg_id[7] = '\0'; } p++; } } } else if((strlen(line) > strlen("From: ")) && substrcmp(line, "From: ") && (strlen(mail_from) == 0)){ /* this should find the <foo@bar> field in the from list */ s_strncpy(mail_from, line + strlen("From: "), MAX_HEADER_LEN); trim_trailing_newline(mail_from); } /* grab the customer name if we see it ! */ if ( strlen(Customer) == 0 && (substrcmp(line, "Customer") || substrcmp(line, "Company") ) && (strlen(line) > strlen("Customer")) ) { char *p = strchr(line,':'); if (*p == ':') *p++; while( *p && (*p == ' ' || *p == '\t') ) *p++; s_strncpy(Customer, p, MAX_HEADER_LEN); #define CUST_MAX 15 if(strlen(Customer) > CUST_MAX) Customer[CUST_MAX] = '\0'; trim_trailing_newline(Customer); } } /* * finish_header_function is called when the document is finished * (by separator function responding TRUE or EOF) this will return * the headline string or NULL. * Presumably finish_header_function will use the * effects of header_function. finish_header_function * will only be called once, so it should clear whatever state * header_function has set. 
*/
/* Builds the headline for an ERG mail thread in HEADER as
 *   [Customer ] (from + " " + subject | subject | from | "Unknown Subject") [ erg_id]
 * with the from text cut to 10 characters, then clears all per-thread
 * state.  Every copy into HEADER goes through s_strncpy/s_strncat with
 * MAX_HEADER_LEN, as the rest of this function already did.
 */
void erg_thread_finish_header_function(header)
char *header;
{
  header[0] = '\0';

  if(strlen(mail_subject) != 0 && strlen(mail_from) != 0){
    /* trim the from line if needed */
    if(strlen(mail_from) > 10){
      mail_from[10] = '\0';
    }
    s_strncpy(header, mail_from, MAX_HEADER_LEN);
    s_strncat(header, " ", MAX_HEADER_LEN, MAX_HEADER_LEN);
    s_strncat(header, mail_subject, MAX_HEADER_LEN, MAX_HEADER_LEN);
  }
  else if(strlen(mail_subject) != 0){
    s_strncpy(header, mail_subject, MAX_HEADER_LEN);
  }
  else if(strlen(mail_from) != 0){
    s_strncpy(header, mail_from, MAX_HEADER_LEN);
  }
  else{
    strcpy(header, "Unknown Subject");
  }

  if( strlen(Customer) > 0 ) {
    /* Assemble "Customer header" in a scratch buffer.  Appending the
     * header onto Customer with strcat could overrun Customer, and
     * copying the result back could overrun HEADER. */
    char combined[MAX_HEADER_LEN + 1];

    s_strncpy(combined, Customer, MAX_HEADER_LEN);
    s_strncat(combined, " ", MAX_HEADER_LEN, MAX_HEADER_LEN);
    s_strncat(combined, header, MAX_HEADER_LEN, MAX_HEADER_LEN);
    s_strncpy(header, combined, MAX_HEADER_LEN);
  }
  if ( strlen(erg_id) > 0 ) {
    s_strncat(header, " ", MAX_HEADER_LEN, MAX_HEADER_LEN);
    s_strncat(header, erg_id, MAX_HEADER_LEN, MAX_HEADER_LEN);
  }
  /* if( strlen(descript) > 0 ) strcat(header, descript); */

#ifdef DEBUG
  printf(" <%s>\n",header);
#endif

  /* clean up */
  mail_from[0] = '\0';
  mail_subject[0] = '\0';
  descript[0] = '\0';
  Customer[0] = '\0';
  erg_id[0] = '\0';
  erg_thread_line_no = 0;
  saw_act_queue = 0;
  saw_descript = 0;
}

/* change descripts */

/*
 * header_function is called on every line so that a headline
 * can be accumulated. This assumes that it will side effect global
 * variables.
 */
/* the header here will be the first line after
 * b. Description of the problem or new feature :
 *
 * now, we can also look for more info, author, product, date etc...
* STILL WORKING HERE ..., hess */ static int saw_description= 0; static int saw_name= 0; static char prod_name[MAX_HEADER_LEN + 1]= { "" }; void erg_cd_header_function(line) char *line; { static char *pat = 0 ; static char *prod = 0 ; char *p = line; char *cursor; if ( !pat ) { pat = regcmp( "Description of the problem", (char *)0 ); prod = regcmp( "Product Name[ \t]*:", (char *)0 ); } while (isspace(*p)) p++; /* skip space */ if ( saw_description ==0 && descript[0] == '\0' && strlen(p) > 15 && (cursor= regex(pat, p)) ) { saw_description++; return; } else /* look for the product name * this is found on the same line as the pattern * use cursor to move into the string */ if ( saw_name == 0 && prod_name[0] == '\0' && strlen(p) > 10 && (cursor = regex(prod, p)) ) { p = cursor; while (isspace(*p)) p++; /* skip space */ s_strncpy(prod_name, p, 13 ); trim_trailing_newline(prod_name); /* need to trim trailing spaces or tabs ! here also ? */ return; } /* we have seen our special string, collect the * next non blank line */ if(saw_description && descript[0] == '\0' ) { if ( p && strlen(p) > 0 ) { s_strncpy(descript, p, MAX_HEADER_LEN); return; } } } /* * finish_header_function is called when the document is finished * (by separator function responding TRUE or EOF) this will return * the headline string or NULL. * Presumably finish_header_function will use the * effects of header_function. finish_header_function * will only be called once, so it should clear whatever state * header_function has set. */ void erg_cd_finish_header_function(header) char *header; { header[0] = '\0'; if(prod_name[0] != '\0') { s_strncpy(header, prod_name, MAX_HEADER_LEN); s_strncat(header, ", ", MAX_HEADER_LEN, MAX_HEADER_LEN); } s_strncat(header, descript, MAX_HEADER_LEN, MAX_HEADER_LEN); /* reset for next file. 
*/ descript[0] = '\0'; prod_name[0] = '\0'; saw_description=0; saw_name=0; if(header[0] == '\0') { /* oops, no header found */ filename_finish_header_function(header); } trim_trailing_newline(header); /* for debugging */ #ifdef DEBUG fprintf(stderr, " <%s>\n",header); #endif }